### Replication Package for "Why is Intermediating Houses so Difficult? Evidence from iBuyers"
### Buchak, Matvos, Piskorski, and Seru
###
###
### buchak@stanford.edu

### This file creates the analysis data from the raw MLS Data. 

library(data.table)
library(lfe)
library(stargazer)
library(survival)
library(zoo)
library(ggplot2)
library(Hmisc)


# Requires actual underlying data, so this cannot be run.
generateSampledRawData <- function() {
  
  files <- dir('/homes/nber/buchak/bulk/iBuyer/split_mls/')
  
  for(ii in 1:length(files))  {
    if(ii == 1) {
      row.names <- names(temp.in)
    }
    print(paste(ii,'/',length(files)))
    temp.in <- fread(paste0('/homes/nber/buchak/bulk/iBuyer/split_mls/',files[ii]))
    if(ii > 1) {
      names(temp.in) <- row.names
    }
    share.data <- sample_and_scramble(temp.in)
    write.table(share.data,file = paste0('/homes/nber/buchak/bulk/iBuyer/replication_data_samples/split_mls/split_raw_',ii,'.csv'),row.names=F,col.names=T)
  }
}


# Used to sample/scramble actual raw data for purposes of replication package data.
sample_and_scramble <- function(DT, frac = 0.1, seed = 0,type='MLS') {
  stopifnot(is.data.table(DT))
  if (!is.null(seed)) set.seed(seed)
  
  # Sampling is done by property id (lots of work is dont with property fixed effects, or properties over time)
  if(type == 'MLS') {
    all.ids <- unique(DT$cc_property_id)
  } else {
    all.ids <- unique(DT$apnformatted)
  }
  
  # 1) sample ~1% of rows (at least 1 row)
  n.ids <- length(all.ids)
  sampled.ids <- sample(all.ids,size = floor(n.ids * frac),replace = F)
  
  if(type == 'MLS') {
    out <- DT[cc_property_id %in% sampled.ids]
  } else {
    out <- DT[apnformatted %in% sampled.ids]
  }
  n = nrow(out)
  
  # drop keys to avoid surprises
  setkey(out, NULL)
  
  # 2) independently permute each column 
  for (j in 1:ncol(out)) {
    set(out, j = j, value = out[[j]][sample.int(n)])
  }
  
  out
}



createProcessedTransactions <- function() {
  
  myCols = c('zip','standardized_property_type','cc_property_id','customer_tracking_id','cc_list_id',
             'sale_type','city','buyer_agent_name','buyer_office_name','listing_agent_name','listing_office_name',
             'owner_full_name','market_ppsf','record_date_time','date','tax_amount','attom_id','status',
             'mls_days_on_market','fips_county_cd','high_school_name','heating','cooling','property_condition',
             'rooms','bedrooms','floors_in_building','gla_sqft','lot_sqft','year_built','assessed_year',
             'assessed_total_value','mls_cumulative_days_on_market','price','most_recent_sale','property_latitude','property_longitude',
             'most_recent_sale_date','trended_prior_sale','renovation.in.listing','has.photos','listing.length.characters','listing.length.words')
  files <- dir('../data/raw/mls/share/')
  
  for(ii in 1:length(files))  {
    print(paste(ii,'/',length(files)))
    temp.in <- fread(paste0('../data/raw/mls/share/',files[ii]))
    
    # Get / create some columns
    temp.in[,has.photos := as.integer(photo_2 != '')]
    temp.in[,renovation.in.listing := as.integer(grepl(public_listing_comments,pattern = 'renovation|refurbish|remodel',ignore.case = T))]
    temp.in[,listing.length.characters := nchar(public_listing_comments)]
    temp.in[,listing.length.words := lengths(gregexpr("\\W+", public_listing_comments)) + 1]
    temp.in <- temp.in[,myCols,with=F]
    
    # Look at single family only	
    temp.in <- temp.in[grepl(standardized_property_type,pattern = 'Single Family',ignore.case=T)]
    temp.in[,date := as.Date(substr(date,start = 1,stop = 10),format = '%Y/%m/%d')]
    temp.in[,record_date_time := as.Date(substr(record_date_time,start = 1,stop = 10),format = '%Y/%m/%d')]
    temp.in[,most_recent_sale_date := as.Date(substr(most_recent_sale_date,start = 1,stop = 10),format = '%Y/%m/%d')]
    temp.in[,year := year(date)]
    temp.in <- temp.in[year >= 2010]
    
    if(ii == 1) {
      result <- temp.in
    } else {
      result <- rbind(result,temp.in)
    }
    
    
  }
  
  write.table(file = '../data/processed/mls/share/combined_processed.csv',x = result,row.names = F, col.names = T)
  
}


createSpellsDataset <- function() {
  
  data.in <- fread('../data/processed/mls/share/combined_processed.csv')
  
  # Fix dates
  data.in[,date := as.Date(substr(date,start = 1,stop = 10),format = '%Y-%m-%d')]
  data.in[,most_recent_sale_date := as.Date(substr(most_recent_sale_date,start = 1,stop = 10),format = '%Y-%m-%d')]
  
  # Get iBuyers
  match.string <- paste(c('opendoor','open door','\\<od [a-z].*',
                          'offerpad','offer pad',
                          'redfin','red fin',
                          'zillow',
                          'knock'),collapse = '|')
  data.in[,iBuyer.buyer := as.integer(grepl(buyer_office_name,pattern = match.string,ignore.case=T) |
                                        grepl(buyer_agent_name,pattern = match.string,ignore.case=T)	) ]
  
  data.in[,iBuyer.seller := as.integer(grepl(listing_agent_name,pattern = match.string,ignore.case=T) |
                                         grepl(listing_agent_name,pattern = match.string,ignore.case=T) | 
                                         grepl(owner_full_name,pattern = match.string,ignore.case=T)) ]		
  
  # Create sale spell groupings
  data.in <- data.in[order(cc_property_id,date)]
  
  data.in[,ones := 1]
  data.in[,listing.num := cumsum(ones),by='cc_list_id']
  data.in$ones <- NULL
  
  # Information about the spells	
  data.in[,total.listings := max(listing.num),by='cc_list_id']
  data.in[,total.nonsale.listings := sum(status != 'Sale'),by='cc_list_id']                             # listings that aren't marked as sales
  data.in[,first.listing.price := max( (listing.num == 1) * price),by='cc_list_id']                     # price for the first listing
  data.in[,last.listing.price := max( (listing.num == total.nonsale.listings) * price),by='cc_list_id'] # price for the last listing that isn't a sale
  data.in[,sale.price := max( (status == 'Sale') * price),by='cc_list_id']                              # price on the listing that is marked as a sale
  data.in[,has.sale := max( as.integer(status == 'Sale') ),by='cc_list_id']                             # indicator for one of the listings is marked as a sale
  data.in[has.sale == 0,sale.price := NA]                                                               # make sure, if there's no sale, sale price is NA
  data.in[,iBuyer.buyer := max(iBuyer.buyer,na.rm=T),by='cc_list_id']                                   
  data.in[,iBuyer.seller := max(iBuyer.seller,na.rm=T),by='cc_list_id']
  
  # Get first, last, and sale listing information.
  # The "last" listing is the listing that occurs directly before the sale
  # The "sale" listing is the listing that is marked as a sale
  first.listing <- data.in[listing.num == 1,c('cc_list_id','date')]
  last.listing <- data.in[listing.num == total.nonsale.listings,c('cc_list_id','date')]
  sale.listing <- data.in[status == 'Sale',c('cc_list_id','date')]
  sale.listing <- sale.listing[!duplicated(cc_list_id)]
  names(first.listing) <- c('cc_list_id','first.listing.date')
  names(last.listing) <- c('cc_list_id','last.listing.date')
  names(sale.listing) <- c('cc_list_id','sale.date')
  first.and.last <- merge(first.listing,last.listing,by=c('cc_list_id'))
  first.and.last <- merge(first.and.last,sale.listing,by=c('cc_list_id'),all.x=T)
  
  # Collapse by spell.id
  bySpell <- data.in[listing.num == 1,c('cc_property_id','zip','cc_list_id','date','owner_full_name','buyer_agent_name','buyer_office_name','listing_agent_name','listing_office_name',
                                        'iBuyer.buyer','iBuyer.seller','city','most_recent_sale','most_recent_sale_date',
                                        'standardized_property_type','sale_type','mls_days_on_market','market_ppsf','mls_cumulative_days_on_market','renovation.in.listing','has.photos',
                                        'total.listings','first.listing.price','last.listing.price','has.sale','sale.price',
                                        'tax_amount','fips_county_cd','high_school_name','heating','cooling','property_condition','listing.length.characters','listing.length.words',
                                        'rooms','bedrooms','floors_in_building','gla_sqft','lot_sqft','year_built','assessed_year','assessed_total_value'),with=F]
  
  bySpell <- merge(bySpell,first.and.last,by='cc_list_id')
  
  bySpell[,days.first.last.listing := as.numeric(last.listing.date - first.listing.date)]
  bySpell[,days.last.listing.sale  := as.numeric(sale.date - last.listing.date)]
  bySpell[,days.first.sale.listing := as.numeric(sale.date - first.listing.date)]
  
  bySpell <- bySpell[sale_type == 'Fair Market']
  bySpell[,first.listing.year := year(first.listing.date)]
  bySpell <- bySpell[is.na(most_recent_sale) | most_recent_sale > 0]
  bySpell <- bySpell[is.na(most_recent_sale_date) | as.numeric(first.listing.date - most_recent_sale_date) > 0]
  
  
  # Remove people seling to themselves...
  bySpell <- bySpell[gsub(listing_office_name,pattern=' ',replace='') != gsub(buyer_office_name,pattern=' ',replace='')]
  
  # Some derived things, like % markup, annualized
  bySpell[,t.last.sale.first.list := as.numeric(first.listing.date - most_recent_sale_date) / 365]
  bySpell[,t.last.sale.last.list := as.numeric(last.listing.date - most_recent_sale_date) / 365]
  bySpell[,t.last.sale.this.sale := as.numeric(sale.date - most_recent_sale_date) / 365]
  
  # Markups: First list to last list. First list to sale. Last list to sale.  Last sale to first list, last sale to this sale.
  bySpell[,last.to.first.listing := last.listing.price / first.listing.price - 1]
  bySpell[,last.to.first.listing.ann := last.to.first.listing / (days.first.last.listing / 365)]
  bySpell[,sale.to.last.listing  := sale.price / last.listing.price - 1]
  bySpell[,sale.to.last.listing.ann  := sale.to.last.listing / (days.last.listing.sale / 365)]
  bySpell[,sale.to.first.listing := sale.price / first.listing.price - 1]
  bySpell[,sale.to.first.listing.ann := sale.to.first.listing / (days.first.sale.listing / 365)]
  bySpell[,first.list.to.last.sale := first.listing.price / most_recent_sale - 1]
  bySpell[,first.list.to.last.sale.ann := first.list.to.last.sale / t.last.sale.first.list]
  bySpell[,sale.to.last.sale := sale.price / most_recent_sale - 1]
  bySpell[,sale.to.last.sale.ann := sale.to.last.sale / t.last.sale.this.sale]
  
  bySpell[,listing.month := month(first.listing.date)]
  bySpell[,sell.month    := month(sale.date)]
  bySpell[,sell.quarter    := quarter(sale.date)]
  bySpell[,listing.quarter := quarter(first.listing.date)]
  
  
  bySpell <- bySpell[(is.na(t.last.sale.first.list) | t.last.sale.first.list > 0) & 
                       first.listing.price > 0 & last.listing.price > 0 & # no crazy listing prices
                       (is.na(sale.price) | (sale.price > 0 & sale.price < 1.5e6)) & # want sale price either not existing or > 0 
                       first.listing.price < 1.5e6 & last.listing.price < 1.5e6 & gla_sqft < 10000] # Don't want absurdly high prices
  
  bySpell[,most_recent_sale_year := year(most_recent_sale_date)]
  bySpell[,age := first.listing.year - year_built]
  bySpell[,age.bucket := cut(age,breaks = c(-999,0,5,10,15,25,50,100,1000))]
  
  bySpell[,size.bin := cut(gla_sqft,breaks = c(0,5000,10000,25000,1000000))]
  bySpell[,size.bin := factor(size.bin,levels(size.bin)[c(4,1:3)])]
  bySpell <- bySpell[order(cc_property_id,first.listing.date)]
  bySpell[,next.listing.date := shift(first.listing.date,1,type='lead'),by='cc_property_id']
  bySpell[,next.last.listing.date := shift(last.listing.date,1,type='lead'),by='cc_property_id']
  bySpell[,relists.within.month := 0]
  bySpell[next.listing.date - last.listing.date < 30 & has.sale == 0, relists.within.month := 1]
  bySpell[next.listing.date - last.listing.date < 30 & has.sale == 0, relists.within.month := 1]
  bySpell[,next.has.sale := shift(has.sale,1,type='lead'),by='cc_property_id']
  bySpell[,next.total.listings  := shift(total.listings,1,type='lead'),by='cc_property_id']
  bySpell[,next.sale.date := shift(sale.date,1,type='lead'),by='cc_property_id']
  bySpell[,next.sale.price  := shift(sale.price,1,type='lead'),by='cc_property_id']
  bySpell[relists.within.month == 0, has.sale.expansive := has.sale ]
  bySpell[relists.within.month == 0, total.listings.expansive := total.listings]
  bySpell[relists.within.month == 0, sale.price.expansive := sale.price]
  bySpell[relists.within.month == 0, days.first.sale.listing.expansive := sale.date - first.listing.date]
  bySpell[relists.within.month == 0, sale.to.first.listing.expansive := sale.to.first.listing]
  bySpell[relists.within.month == 0, last.listing.date.expansive := last.listing.date ]
  bySpell[relists.within.month == 0, sale.date.expansive := sale.date ]
  bySpell[relists.within.month == 1, has.sale.expansive := next.has.sale ]
  bySpell[relists.within.month == 1, total.listings.expansive := next.total.listings + total.listings]
  bySpell[relists.within.month == 1, sale.price.expansive := next.sale.price]
  bySpell[relists.within.month == 1, days.first.sale.listing.expansive := next.sale.date - first.listing.date]
  bySpell[relists.within.month == 1, sale.to.first.listing.expansive := I(sale.price.expansive / first.listing.price)]
  bySpell[relists.within.month == 1, last.listing.date.expansive := next.last.listing.date ]
  bySpell[relists.within.month == 1, sale.date.expansive := next.sale.date ]
  bySpell[,Lister := 'Other']
  bySpell[t.last.sale.first.list < 1, Lister := 'Flipper']
  bySpell[iBuyer.seller == 1,Lister := 'iBuyer']
  bySpell[,Lister := factor(Lister)]
  bySpell[,Lister := factor(Lister,levels(Lister)[c(3,1,2)])]
  bySpell[,qtr := quarter(first.listing.date)]
  
  # cl = censoring levels
  cl = 0.99
  ll = quantile(abs(bySpell$last.to.first.listing),cl,na.rm=T)
  aa = quantile(abs(bySpell$last.to.first.listing.ann),cl,na.rm=T)
  gg = quantile(abs(bySpell$first.list.to.last.sale),.95,na.rm=T)
  tt = quantile(abs(bySpell$total.listings),.99,na.rm=T)
  
  toReg <- bySpell[ first.listing.price > 0]
  toReg[,Lister := factor(Lister,levels = c('Other','Flipper','iBuyer'))]
  
  write.table(file = '../data/processed/mls/share/spells_processed.csv',x = toReg,row.names = F, col.names = T)
  
  
}

createProcessedTransactions()
createSpellsDataset()
